import pandas as pd
import numpy as np
import io
import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import altair
altair.data_transformers.disable_max_rows()
# Set style & figures inline
sns.set()
%matplotlib inline
# Load the medical-conditions dataset and the three JHU CSSE time-series
# files (one cumulative column per date, one row per country/province).
medical_conditions_df = pd.read_csv('Covid19DataSet.csv')
confirmed_df = pd.read_csv('time_series_covid19_confirmed_global.csv')
deaths_df = pd.read_csv('time_series_covid19_deaths_global.csv')
recovered_df = pd.read_csv('time_series_covid19_recovered_global.csv')

def _aggregate_by_country(df):
    """Collapse province-level rows into one row per country.

    Renames "Country/Region" to "Location", sums all date columns per
    country, drops the (now meaningless, summed) Lat/Long columns, and
    zero-fills any gaps.
    """
    out = (df.rename(columns={"Country/Region": "Location"})
             .groupby(['Location']).sum()
             .drop(['Lat', 'Long'], axis=1))
    out.fillna(0, inplace=True)
    return out

ccdf = _aggregate_by_country(confirmed_df)   # cumulative confirmed per country
crdf = _aggregate_by_country(recovered_df)   # cumulative recovered per country
cddf = _aggregate_by_country(deaths_df)      # cumulative deaths per country
# Active cases = confirmed - deaths - recovered (element-wise, same shape).
cadf = pd.DataFrame(ccdf - cddf - crdf)
ccdf.head()
crdf.head()
cddf.head()
cadf.head()
def generate_heatmap(data, title, color):
    """Render a world choropleth of `data["Cases"]` on a log10 color scale.

    Parameters
    ----------
    data : DataFrame indexed by country name with a single "Cases" column.
    title : str, figure title.
    color : str, a Plotly colorscale name (e.g. 'Reds', 'jet').
    """
    temp_df = data.reset_index()  # exposes the index as a "Location" column
    fig = px.choropleth(
        temp_df,
        locations="Location",
        locationmode="country names",
        color=np.log10(temp_df["Cases"] + 1),  # +1 avoids log10(0) for zero-case rows
        hover_name="Location",
        hover_data=["Cases"],                  # show raw counts on hover
    )
    fig.update_geos(fitbounds="locations", visible=False)
    fig.update_layout(title_text=title)
    # Single source of truth for the colorscale: the previous
    # color_continuous_scale=Plasma argument was dead code, immediately
    # overridden by this call.
    fig.update_coloraxes(colorbar_title="Log Scale", colorscale=color)
    fig.show()
# Render one log-scale heat map per case category (confirmed, deaths,
# recovered, active), each using only the most recent day's cumulative
# counts (the right-most date column).
for frame, title, color in (
    (ccdf, "Confirmed Covid-19 Cases Heat Map (Log Scale)", 'fall'),
    (cddf, "Covid-19 Death Cases Heat Map (Log Scale)", 'Reds'),
    (crdf, "Covid-19 Recovered Cases Heat Map (Log Scale)", 'Greens'),
    (cadf, "Covid-19 Active Cases Heat Map (Log Scale)", 'jet'),
):
    data = frame.iloc[:, -1:]
    data = data.rename(columns={data.columns[-1]: "Cases"})
    generate_heatmap(data, title, color)
# for analysis take top 50 countries
def pick_top_countries(data, top_country):
    """Return the `top_country` rows with the highest value in the most
    recent (right-most) column of `data`."""
    latest_col = data.columns[-1]
    ranked = data.sort_values(latest_col, ascending=False)
    return ranked.head(top_country)
# Restrict each case table to the 50 hardest-hit countries (ranked by the
# latest cumulative count).
top_country = 50
ccdf1, cadf1, crdf1, cddf1 = (
    pick_top_countries(frame, top_country)
    for frame in (ccdf, cadf, crdf, cddf)
)
# for plotting need to convert date into DateTime Index
def setDateTimeIndex(data):
    """Transpose `data` so dates become rows, then parse the date labels
    into a proper DatetimeIndex (needed for time-series plotting)."""
    transposed = data.transpose()
    transposed.index = pd.DatetimeIndex(transposed.index)
    return transposed
# Convert each top-50 table to a date-indexed (dates-as-rows) time series.
ccdf1, cadf1, crdf1, cddf1 = map(setDateTimeIndex, (ccdf1, cadf1, crdf1, cddf1))
# plot time series data
def plot_time_series_data(data, title_str, x_label, y_label, logy=False):
    """Line-plot every column of `data` (one line per country) on a single
    set of axes, optionally with a logarithmic y-axis."""
    axes = data.plot(figsize=(20, 10), linewidth=2, marker='.',
                     fontsize=20, logy=logy)
    axes.legend(ncol=3, loc='lower right')
    plt.xlabel(x_label, fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.title(title_str, fontsize=20)
plot_time_series_data(ccdf1, 'Confirmed Covid-19 Cases (Top 50)', 'Days', 'Confimed Cases', logy=True)
plot_time_series_data(cadf1, 'Active Covid-19 Cases (Top 50)', 'Days', 'Active Cases (Log-Scale)', logy=True)
plot_time_series_data(cddf1, 'Covid-19 Death Cases (Top 50)', 'Days', 'Number of People Died (Log-Scale)', logy=True)
# BUG FIX: the recovered-cases plot previously re-plotted cddf1 (deaths);
# it must use crdf1.
plot_time_series_data(crdf1, 'Covid-19 Recovered Cases (Top 50)', 'Days', 'Number of People Recovered (Log-Scale)', logy=True)
In all of the plots above the x-axis is not aligned: each country's outbreak begins on a different date. To compare trends, every series must be shifted to a common starting point; we choose the day a country first reached 100 cases.
# for certain countries x-coordinate (cases) mismatch,
# so lets drop all columns where cases are below some threshold
def align_data_to_minval(data, min_val):
    """Align each country's series to start at its first day with at least
    `min_val` cases.

    Values below the threshold become NaN, all-NaN countries are dropped,
    the date index is discarded (rows become day offsets), and each column
    is shifted up so that row 0 is the first above-threshold day.

    BUG FIX: the original mutated the caller's DataFrame in place (both the
    threshold masking and the inplace dropna); we now work on a copy.
    """
    data = data.copy()
    # Mask everything below the threshold.
    for col in data.columns:
        data.loc[(data[col] < min_val), col] = None
    # Drop countries that never reach the threshold.
    data.dropna(axis=1, how='all', inplace=True)
    # Replace the date index with a plain positional index.
    data = data.reset_index().drop(['index'], axis=1)
    # Shift each column so its first valid (above-threshold) value lands
    # at row 0; trailing rows become NaN.
    for col in data.columns:
        data[col] = data[col].shift(-data[col].first_valid_index())
    return data
# Drop all observations below min_cases and align every country's series to
# the day it first reached that threshold.
min_cases = 100
tccdf = align_data_to_minval(ccdf1, min_cases)
tcadf = align_data_to_minval(cadf1, min_cases)
tcrdf = align_data_to_minval(crdf1, min_cases)
tcddf = align_data_to_minval(cddf1, min_cases)
plot_time_series_data(tccdf, 'Confirmed Covid-19 Cases (Top 50)', 'Days', 'Confimed Cases', logy=True)
plot_time_series_data(tcadf, 'Active Covid-19 Cases (Top 50)', 'Days', 'Active Cases (Log-Scale)', logy=True)
plot_time_series_data(tcddf, 'Covid-19 Death Cases (Top 50)', 'Days', 'Number of People Died (Log-Scale)', logy=True)
# BUG FIX: the recovered-cases plot previously re-plotted tcddf (deaths);
# it must use tcrdf.
plot_time_series_data(tcrdf, 'Covid-19 Recovered Cases (Top 50)', 'Days', 'Number of People Recovered (Log-Scale)', logy=True)
# Interactive ALTAIR Tool for Plotting
def interactive_plot(data, xlabel, ylabel):
    """Build an interactive Altair line chart of per-country series.

    Clicking a line highlights that country and greys out the others.
    `data` has day offsets as rows and countries as columns; melting relies
    on data.columns.name being 'Location' (set upstream by the groupby).
    """
    daywise_data = (data.reset_index()
                        .melt(id_vars='index', value_name=ylabel)
                        .rename(columns={'index': xlabel}))
    # Click-selection keyed on country name.
    selection = altair.selection_single(fields=['Location'])
    # Base chart: one log-scale line per country.
    base = altair.Chart(daywise_data).mark_line(strokeWidth=4, opacity=0.7).encode(
        x=altair.X(xlabel),
        y=altair.Y(ylabel, scale=altair.Scale(type='log')),
        color='Location',
        tooltip=['Location', xlabel, ylabel]
    ).properties(width=700, height=500)
    # Grey out everything that is not selected.
    chart = base.encode(
        color=altair.condition(selection, 'Location:N', altair.value('lightgray'))
    ).add_selection(selection)
    # Semi-transparent overlay showing only the selected country.
    # BUG FIX: the overlay tooltip referenced a nonexistent 'Name' column;
    # use the columns that actually exist in the melted frame.
    overlay = base.encode(
        color='Location',
        opacity=altair.value(0.5),
        tooltip=['Location', xlabel, ylabel]
    ).transform_filter(selection)
    # Layer the highlight on top of the greyed-out base.
    return chart + overlay
# Interactive Altair views of the aligned top-50 series. In a notebook only
# the last cell expression renders; here the earlier return values are
# simply discarded.
interactive_plot(tccdf, 'Days', 'TotalCases')
interactive_plot(tcadf, 'Days', 'ActiveCases')
interactive_plot(tcrdf, 'Days', 'Recovered')
interactive_plot(tcddf, 'Days', 'Deaths')
We will perform further analysis on the five countries most affected in terms of death toll. We will first plot their data and then produce forecasts for these countries. Along the way we will also discuss the medical and political measures these countries have taken to fight the COVID-19 epidemic.
# Focus on the five countries with the highest death toll.
top_country = 5
cddf2 = pick_top_countries(cddf, top_country)
Region_OF_INTREST = list(cddf2.index)
Region_OF_INTREST
# countries selected are 'US', 'Brazil', 'United Kingdom', 'Mexico', 'Italy'
selected_confirmed_cases = ccdf.transpose()[Region_OF_INTREST].transpose()
selected_death_cases = cddf.transpose()[Region_OF_INTREST].transpose()
selected_recovered_cases = crdf.transpose()[Region_OF_INTREST].transpose()
selected_active_cases = cadf.transpose()[Region_OF_INTREST].transpose()
ccdf3 = setDateTimeIndex(selected_confirmed_cases)
# BUG FIX: the death and active frames were swapped here (cadf3 previously
# received deaths and cddf3 active cases), mislabeling all downstream plots.
cadf3 = setDateTimeIndex(selected_active_cases)
crdf3 = setDateTimeIndex(selected_recovered_cases)
cddf3 = setDateTimeIndex(selected_death_cases)
selected_confirmed_cases.head()
# delete all cases less than min_cases
# A lower threshold (10) than the top-50 analysis, since only five countries
# remain and their death/recovery series start from smaller counts.
min_cases = 10
tccdf = align_data_to_minval(ccdf3, min_cases)
tcadf = align_data_to_minval(cadf3, min_cases)
tcrdf = align_data_to_minval(crdf3, min_cases)
tcddf = align_data_to_minval(cddf3, min_cases)
# Interactive charts for the five selected countries (return values other
# than the last are discarded outside a notebook cell).
interactive_plot(tccdf, 'Days', 'TotalCases')
interactive_plot(tcadf, 'Days', 'ActiveCases')
interactive_plot(tcrdf, 'Days', 'Recovered')
interactive_plot(tcddf, 'Days', 'Deaths')
Data Preparation
# medical_conditions_df.head(5)
# mddf = medical_conditions_df.rename(columns={"location": "Location"}).drop(['iso_code', 'continent'], axis=1)#.groupby(['Location']).sum()
# mddf.set_index()
# mddf.fillna(0,inplace=True)
# mddf.head()
# Aggregate the five selected countries into one combined series per case
# type: sum(axis=0) adds across the country rows, leaving one value per
# date column; the string date labels are then parsed into a DatetimeIndex.
confirmed_cases = selected_confirmed_cases.sum(axis=0)
confirmed_cases.index = pd.to_datetime(confirmed_cases.index)
death_cases = selected_death_cases.sum(axis=0)
death_cases.index = pd.to_datetime(death_cases.index)
recovered_cases = selected_recovered_cases.sum(axis=0)
recovered_cases.index = pd.to_datetime(recovered_cases.index)
active_cases = selected_active_cases.sum(axis=0)
active_cases.index = pd.to_datetime(active_cases.index)
# Cumulative (running-total) curves for the combined top-5 countries.
plt.plot(confirmed_cases, label="Daily confirm cases")
plt.plot(death_cases, label="Daily death cases")
plt.plot(recovered_cases, label="Daily recovered cases")
plt.plot(active_cases, label="Daily active cases")
plt.legend();
plt.title("Cumulative Study");
# Convert cumulative totals into daily new counts. diff() leaves the first
# entry NaN, so it is backfilled with the first cumulative value (day one's
# count). BUG FIX: integer access like confirmed_cases[0] on a Series with a
# DatetimeIndex is deprecated (removed in pandas 2.x); use .iloc[0].
confirmed_cases = confirmed_cases.diff().fillna(confirmed_cases.iloc[0]).transpose().astype(np.float64)
death_cases = death_cases.diff().fillna(death_cases.iloc[0]).transpose().astype(np.float64)
recovered_cases = recovered_cases.diff().fillna(recovered_cases.iloc[0]).transpose().astype(np.float64)
active_cases = active_cases.diff().fillna(active_cases.iloc[0]).transpose().astype(np.float64)
plt.plot(confirmed_cases, label="Daily confirm cases")
plt.plot(death_cases, label="Daily death cases")
plt.plot(recovered_cases, label="Daily recovered cases")
plt.plot(active_cases, label="Daily active cases")
plt.legend();
plt.title("Daily Trend");
import torch
from torch import nn, optim
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from pandas.plotting import register_matplotlib_converters
# Fix the NumPy and PyTorch RNG seeds so training runs are reproducible.
RANDOM_SEED = 100
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
def create_data_sequences(data, seq_length):
    """Slide a window of length `seq_length` over `data`, returning
    (windows, next-values) as NumPy arrays for supervised training.

    Note: the loop stops at len(data) - seq_length - 1, so the final
    possible window is never produced — kept as-is to preserve the
    existing behavior.
    """
    windows, targets = [], []
    for start in range(len(data) - seq_length - 1):
        stop = start + seq_length
        windows.append(data[start:stop])
        targets.append(data[stop])
    return np.array(windows), np.array(targets)
def data_prepration(data):
    """Split `data` into train/test, scale to [0, 1], and window it into
    (sequence, next-value) training pairs as float32 torch tensors.

    Returns (seq_length, scaled train, scaled test, fitted scaler,
    X_train, y_train, X_test, y_test).
    """
    # Hold out the final 25% of the series as the test split.
    # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int does the same truncation.
    test_data_size = int(0.25 * len(data))
    train_data = data[:-test_data_size]
    test_data = data[-test_data_size:]
    # Fit the scaler on the training split only, to avoid test leakage.
    scaler = MinMaxScaler()
    scaler = scaler.fit(np.expand_dims(train_data, axis=1))
    train_data = scaler.transform(np.expand_dims(train_data, axis=1))
    test_data = scaler.transform(np.expand_dims(test_data, axis=1))
    # Window length is half the test split size.
    seq_length = int(test_data_size / 2)
    X_train, y_train = create_data_sequences(train_data, seq_length)
    X_test, y_test = create_data_sequences(test_data, seq_length)
    # Convert NumPy arrays to float32 torch tensors.
    X_train = torch.from_numpy(X_train).float()
    y_train = torch.from_numpy(y_train).float()
    X_test = torch.from_numpy(X_test).float()
    y_test = torch.from_numpy(y_test).float()
    return seq_length, train_data, test_data, scaler, X_train, y_train, X_test, y_test
class CasePredictor(nn.Module):
    """Stacked-LSTM regressor predicting the next value of a scaled,
    univariate case-count sequence.

    NOTE(review): the hidden state is shaped (n_layers, seq_len, n_hidden)
    and the input is viewed as (len(sequences), seq_len, features) while
    nn.LSTM here is sequence-first (batch_first=False), so the "batch" and
    "time" axes are effectively swapped relative to the usual convention.
    Training and inference are self-consistent under this layout, but
    confirm before reusing the class elsewhere.
    """
    def __init__(self, n_features, n_hidden, seq_len, n_layers=2):
        # n_features: values per timestep (1 for a univariate series)
        # n_hidden:   LSTM hidden units per layer
        # seq_len:    window length fed to the model
        # n_layers:   stacked LSTM layers (dropout only applies when > 1)
        super(CasePredictor, self).__init__()
        self.n_hidden = n_hidden
        self.seq_len = seq_len
        self.n_layers = n_layers
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_hidden,
            num_layers=n_layers,
            dropout=0.5
        )
        # Project the final LSTM output down to a single predicted value.
        self.linear = nn.Linear(in_features=n_hidden, out_features=1)
    def reset_hidden_state(self):
        """Zero the (h, c) state; called once per epoch by the training
        loop so state does not carry across epochs."""
        self.hidden = (
            torch.zeros(self.n_layers, self.seq_len, self.n_hidden),
            torch.zeros(self.n_layers, self.seq_len, self.n_hidden)
        )
    def forward(self, sequences):
        """Return one prediction per input window in `sequences`."""
        lstm_out, self.hidden = self.lstm(
            sequences.view(len(sequences), self.seq_len, -1),
            self.hidden
        )
        # Reinterpret the output and take the last slice along the first
        # axis (the final "time step" under the layout described on the
        # class docstring), then project to a scalar per window.
        last_time_step = \
            lstm_out.view(self.seq_len, len(sequences), self.n_hidden)[-1]
        y_pred = self.linear(last_time_step)
        return y_pred
def train_model(model, train_data, train_labels, test_data=None, test_labels=None, num_epochs=460):
    """Train `model` with MSE loss and Adam (lr=0.001).

    BUG FIX: the loop previously read the module-level globals
    X_train/y_train/X_test/y_test instead of its parameters, so the
    arguments passed in were silently ignored. `num_epochs` is now a
    keyword parameter (default 460, the previous hard-coded value).

    Returns (model in eval mode, per-epoch train losses, per-epoch test
    losses — zeros when no test data is supplied).
    """
    loss_fn = torch.nn.MSELoss()
    optimiser = torch.optim.Adam(model.parameters(), lr=0.001)
    train_hist = np.zeros(num_epochs)
    test_hist = np.zeros(num_epochs)
    for t in range(num_epochs):
        model.reset_hidden_state()  # re-zero the LSTM state each epoch
        y_pred = model(train_data)
        loss = loss_fn(y_pred.float(), train_labels)
        if test_data is not None:
            with torch.no_grad():  # no gradients needed for evaluation
                y_test_pred = model(test_data)
                test_loss = loss_fn(y_test_pred.float(), test_labels)
            test_hist[t] = test_loss.item()
            if t % 10 == 0:
                print(f'Epoch {t} training loss: {loss.item()} test loss: {test_loss.item()}')
        elif t % 10 == 0:
            print(f'Epoch {t} training loss: {loss.item()}')
        train_hist[t] = loss.item()
        optimiser.zero_grad()  # clear gradients accumulated last step
        loss.backward()
        optimiser.step()
    return model.eval(), train_hist, test_hist
# Prepare Data: 75/25 split, min-max scaling, and sliding windows over the
# combined daily confirmed-case series.
seq_length, train_data, test_data, scaler, X_train, y_train, X_test, y_test = data_prepration(confirmed_cases)
# Create a forecast model (1 feature, 100 hidden units, 2 LSTM layers).
forecast_model = CasePredictor(n_features=1, n_hidden=100, seq_len=seq_length, n_layers=2)
# Training (test split passed in for per-epoch evaluation).
forecast_model, train_hist, test_hist = train_model( forecast_model, X_train, y_train, X_test, y_test )
plt.plot(train_hist, label="Training loss")
plt.plot(test_hist, label="Test loss")
plt.legend();
# torch.save(forecast_model,'covid_forecast_model.pt')
# lets forcast all test values to compare with ground
def forecasting(model, data, seq_length, DAYS_TO_PREDICT=None):
    """Autoregressively roll the model forward from the first window.

    Starts from data[:1], predicts one step, slides the window by
    appending the prediction and dropping the oldest value, and repeats.
    Runs for len(data) steps when DAYS_TO_PREDICT is None, otherwise for
    DAYS_TO_PREDICT steps.

    BUG FIX / cleanup: the two branches were verbatim duplicates differing
    only in the step count; merged, and `== None` replaced with `is None`.

    Returns (final window tensor of shape (1, seq_length, 1), list of
    scaled predictions).
    """
    steps = len(data) if DAYS_TO_PREDICT is None else DAYS_TO_PREDICT
    with torch.no_grad():  # inference only — no gradient tracking
        test_seq = data[:1]
        preds = []
        for _ in range(steps):
            y_test_pred = model(test_seq)
            pred = torch.flatten(y_test_pred).item()
            preds.append(pred)
            # Slide the window: append the prediction, drop the oldest value.
            new_seq = test_seq.numpy().flatten()
            new_seq = np.append(new_seq, [pred])
            new_seq = new_seq[1:]
            test_seq = torch.as_tensor(new_seq).view(1, seq_length, 1).float()
    return test_seq, preds
# Remap predicted test values to origional format (Undo Normalization) as well as GT
# Roll the model across the test horizon, then invert the min-max scaling
# on both predictions and ground truth so they are in raw case counts.
test_seq, preds = forecasting(forecast_model, X_test, seq_length)
true_cases = scaler.inverse_transform(np.expand_dims(y_test.flatten().numpy(), axis=0)).flatten()
predicted_cases = scaler.inverse_transform(np.expand_dims(preds, axis=0)).flatten()
# Lets plot the predicted Results and GT on the original date axis:
# training history first, then real vs. predicted over the test window.
plt.plot(confirmed_cases.index[:len(train_data)], scaler.inverse_transform(train_data).flatten(),label='Historical Daily Cases')
plt.plot(confirmed_cases.index[len(train_data):len(train_data) + len(true_cases)], true_cases,label='Real Daily Cases')
plt.plot(confirmed_cases.index[len(train_data):len(train_data) + len(true_cases)], predicted_cases, label='Predicted Daily Cases')
plt.legend();
Let's forecast COVID-19 cases for the next 60 days.
# Refit the scaler on the full daily series so future predictions can be
# inverse-transformed on the same scale as all observed data.
scaler = MinMaxScaler()
scaler = scaler.fit(np.expand_dims(confirmed_cases, axis=1))
all_data = scaler.transform(np.expand_dims(confirmed_cases, axis=1))
X_all, y_all = create_data_sequences(all_data, seq_length)
X_all = torch.from_numpy(X_all).float()
y_all = torch.from_numpy(y_all).float()
DAYS_TO_PREDICT = 60
# NOTE(review): X_all/y_all are built but never used — the rollout below
# seeds from X_test[:1]; presumably X_all (or the latest window) was
# intended. Confirm before trusting the forecast horizon.
test_seq, preds = forecasting(forecast_model, X_test, seq_length, DAYS_TO_PREDICT)
predicted_cases = scaler.inverse_transform(np.expand_dims(preds, axis=0)).flatten()
# NOTE(review): the closed= kwarg of pd.date_range was deprecated in pandas
# 1.4 and removed in 2.0 (use inclusive='right' there).
predicted_index = pd.date_range(start=confirmed_cases.index[-1],periods=DAYS_TO_PREDICT + 1,closed='right')
predicted_cases = pd.Series(data=predicted_cases,index=predicted_index)
plt.plot(confirmed_cases, label='Historical Daily Cases')
plt.plot(predicted_cases, label='Future Daily Cases')
plt.legend();
# Generate Data for the death series: same split/scale/window pipeline.
seq_length, train_data, test_data, scaler, X_train, y_train, X_test, y_test = data_prepration(death_cases)
# Create a forecast model for deaths (larger hidden size than confirmed).
death_forecast_model = CasePredictor(n_features=1, n_hidden=512, seq_len=seq_length, n_layers=2)
# Training Death Forecast Model.
# BUG FIX: the trained model was previously assigned back to
# `forecast_model`, silently clobbering the confirmed-cases model.
death_forecast_model, train_hist, test_hist = train_model( death_forecast_model, X_train, y_train, X_test, y_test )
plt.plot(train_hist, label="Training loss")
plt.plot(test_hist, label="Test loss")
plt.legend();
torch.save(death_forecast_model,'death_forecast_model.pt')
# Undo normalization on test predictions and ground truth.
test_seq, preds = forecasting(death_forecast_model, X_test, seq_length)
true_cases = scaler.inverse_transform(np.expand_dims(y_test.flatten().numpy(), axis=0)).flatten()
predicted_cases = scaler.inverse_transform(np.expand_dims(preds, axis=0)).flatten()
# Plot predictions against ground truth on the death-series date axis.
plt.plot(death_cases.index[:len(train_data)], scaler.inverse_transform(train_data).flatten(),label='Historical Death Cases')
plt.plot(death_cases.index[len(train_data):len(train_data) + len(true_cases)], true_cases,label='Real Death Cases')
plt.plot(death_cases.index[len(train_data):len(train_data) + len(true_cases)], predicted_cases, label='Predicted Death Cases')
plt.legend();
# BUG FIX: the scaler and history below previously used confirmed_cases,
# so death predictions were inverse-transformed on the wrong scale and the
# "historical" curve showed confirmed cases.
scaler = MinMaxScaler()
scaler = scaler.fit(np.expand_dims(death_cases, axis=1))
all_data = scaler.transform(np.expand_dims(death_cases, axis=1))
X_all, y_all = create_data_sequences(all_data, seq_length)
X_all = torch.from_numpy(X_all).float()
y_all = torch.from_numpy(y_all).float()
DAYS_TO_PREDICT = 60
test_seq, preds = forecasting(death_forecast_model, X_test, seq_length, DAYS_TO_PREDICT)
predicted_cases = scaler.inverse_transform(np.expand_dims(preds, axis=0)).flatten()
# NOTE(review): closed= is deprecated in pandas>=1.4 (use inclusive=).
predicted_index = pd.date_range(start=death_cases.index[-1],periods=DAYS_TO_PREDICT + 1,closed='right')
predicted_cases = pd.Series(data=predicted_cases,index=predicted_index)
plt.plot(death_cases, label='Historical Daily Death Cases')
plt.plot(predicted_cases, label='Future Daily Death Cases')
plt.legend();
# Pull per-country case totals and medical-infrastructure columns from the
# medical-conditions (OWID-style) dataset.
country_cases = pd.DataFrame(medical_conditions_df, columns=["date", "location","total_cases","total_deaths","handwashing_facilities","hospital_beds_per_thousand"])
country_cases.fillna(0,inplace=True)
# NOTE(review): summing across all dates adds up cumulative columns
# (total_cases/total_deaths), which inflates them — .max() or the latest
# date's row is likely intended. Confirm before interpreting these numbers.
country_data = country_cases.groupby(["location"]).sum()
df_countries_cases = pd.DataFrame(country_data)
country_cases.head()
df_countries_cases.describe()